import nltk #for documents
import string
import re
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style = 'white')
from nltk.stem.porter import*
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction import _stop_words
# documents cleaning
from collections import Counter
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
# plot
import matplotlib
import plotly.offline as py #interchange
# Initialise plotly's notebook integration so that py.iplot(...) can render
# figures inline. The function has to be called; naming it alone does nothing.
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
%matplotlib inline
import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.models import ColumnDataSource
from bokeh.plotting import figure, show, output_notebook
import warnings
warnings.filterwarnings('ignore')
import logging
logging.getLogger('1da').setLevel(logging.WARNING)
sample = pd.read_csv('sample.csv')
sample.head()
| Uniq Id | Product Name | Brand Name | Asin | Category | Upc Ean Code | List Price | Selling Price | Quantity | Model Number | ... | Product Url | Stock | Product Details | Dimensions | Color | Ingredients | Direction To Use | Is Amazon Seller | Size Quantity Variant | Product Description | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 4c69b61db1fc16e7013b43fc926e502d | DB Longboards CoreFlex Crossbow 41" Bamboo Fib... | NaN | NaN | Sports & Outdoors | Outdoor Recreation | Skate... | NaN | NaN | $237.68 | NaN | NaN | ... | https://www.amazon.com/DB-Longboards-CoreFlex-... | NaN | NaN | NaN | NaN | NaN | NaN | Y | NaN | NaN |
| 1 | 66d49bbed043f5be260fa9f7fbff5957 | Electronic Snap Circuits Mini Kits Classpack, ... | NaN | NaN | Toys & Games | Learning & Education | Science ... | NaN | NaN | $99.95 | NaN | 55324 | ... | https://www.amazon.com/Electronic-Circuits-Cla... | NaN | NaN | NaN | NaN | NaN | NaN | Y | NaN | NaN |
| 2 | 2c55cae269aebf53838484b0d7dd931a | 3Doodler Create Flexy 3D Printing Filament Ref... | NaN | NaN | Toys & Games | Arts & Crafts | Craft Kits | NaN | NaN | $34.99 | NaN | NaN | ... | https://www.amazon.com/3Doodler-Plastic-Innova... | NaN | NaN | NaN | NaN | NaN | NaN | Y | NaN | NaN |
| 3 | 18018b6bc416dab347b1b7db79994afa | Guillow Airplane Design Studio with Travel Cas... | NaN | NaN | Toys & Games | Hobbies | Models & Model Kits |... | NaN | NaN | $28.91 | NaN | 142 | ... | https://www.amazon.com/Guillow-Airplane-Design... | NaN | NaN | NaN | NaN | NaN | NaN | Y | NaN | NaN |
| 4 | e04b990e95bf73bbe6a3fa09785d7cd0 | Woodstock- Collage 500 pc Puzzle | NaN | NaN | Toys & Games | Puzzles | Jigsaw Puzzles | NaN | NaN | $17.49 | NaN | 62151 | ... | https://www.amazon.com/Woodstock-Collage-500-p... | NaN | NaN | NaN | NaN | NaN | NaN | Y | NaN | NaN |
5 rows × 28 columns
sample.shape
(10002, 28)
sample.dtypes
Uniq Id object Product Name object Brand Name float64 Asin float64 Category object Upc Ean Code object List Price float64 Selling Price object Quantity float64 Model Number object About Product object Product Specification object Technical Details object Shipping Weight object Product Dimensions object Image object Variants object Sku float64 Product Url object Stock float64 Product Details float64 Dimensions float64 Color float64 Ingredients float64 Direction To Use float64 Is Amazon Seller object Size Quantity Variant float64 Product Description float64 dtype: object
# Keep only the columns we are interested in. The explicit copy makes `sample`
# an independent frame, so later column assignments modify it directly instead
# of a slice of the full table (which triggers SettingWithCopyWarning).
sample = sample[['Category', 'Brand Name', 'Selling Price', 'About Product', 'Shipping Weight']].copy()
sample.head()
| Category | Brand Name | Selling Price | About Product | Shipping Weight | |
|---|---|---|---|---|---|
| 0 | Sports & Outdoors | Outdoor Recreation | Skate... | NaN | $237.68 | Make sure this fits by entering your model num... | 10.7 pounds |
| 1 | Toys & Games | Learning & Education | Science ... | NaN | $99.95 | Make sure this fits by entering your model num... | 4 pounds |
| 2 | Toys & Games | Arts & Crafts | Craft Kits | NaN | $34.99 | Make sure this fits by entering your model num... | 12.8 ounces |
| 3 | Toys & Games | Hobbies | Models & Model Kits |... | NaN | $28.91 | Make 8 different Planes at one time. | Experim... | 13.4 ounces |
| 4 | Toys & Games | Puzzles | Jigsaw Puzzles | NaN | $17.49 | Make sure this fits by entering your model num... | 13.4 ounces |
# Note that some prices are stored as a dict or as a range (e.g. 70-100), so we cannot derive an
# accurate value for them. The good news is that there are not many of them, so we are just going
# to ignore these for now.
# Remove the currency symbol and thousands separators as literal text.
# regex=False matters: as a regular expression "$" is the end-of-string anchor,
# not a dollar sign.
price_text = (
    sample['Selling Price']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
)
# Anything that still is not a plain number (e.g. a price range) becomes NaN.
sample['Selling Price'] = pd.to_numeric(price_text, errors='coerce')
sample
| Category | Brand Name | Selling Price | About Product | Shipping Weight | |
|---|---|---|---|---|---|
| 0 | Sports & Outdoors | Outdoor Recreation | Skate... | NaN | 237.68 | Make sure this fits by entering your model num... | 10.7 pounds |
| 1 | Toys & Games | Learning & Education | Science ... | NaN | 99.95 | Make sure this fits by entering your model num... | 4 pounds |
| 2 | Toys & Games | Arts & Crafts | Craft Kits | NaN | 34.99 | Make sure this fits by entering your model num... | 12.8 ounces |
| 3 | Toys & Games | Hobbies | Models & Model Kits |... | NaN | 28.91 | Make 8 different Planes at one time. | Experim... | 13.4 ounces |
| 4 | Toys & Games | Puzzles | Jigsaw Puzzles | NaN | 17.49 | Make sure this fits by entering your model num... | 13.4 ounces |
| ... | ... | ... | ... | ... | ... |
| 9997 | Toys & Games | Learning & Education | Counting... | NaN | 9.31 | NaN | 4.8 ounces |
| 9998 | Toys & Games | Arts & Crafts | NaN | 6.99 | Make sure this fits by entering your model num... | 0.96 ounces |
| 9999 | Office Products | Office & School Supplies | E... | NaN | 37.95 | Make sure this fits by entering your model num... | 2.8 pounds |
| 10000 | Toys & Games | Arts & Crafts | Craft Kits | NaN | 3.58 | NaN | 6.1 ounces |
| 10001 | Home & Kitchen | Furniture | Kids' Furniture |... | NaN | 152.27 | Make sure this fits by entering your model num... | 20.5 pounds |
10002 rows × 5 columns
Convert Shipping Weight to a common scale; we chose ounces here¶sample['Shipping Weight']
0 10.7 pounds
1 4 pounds
2 12.8 ounces
3 13.4 ounces
4 13.4 ounces
...
9997 4.8 ounces
9998 0.96 ounces
9999 2.8 pounds
10000 6.1 ounces
10001 20.5 pounds
Name: Shipping Weight, Length: 10002, dtype: object
# Convert free-text weights such as "10.7 pounds" or "12.8 ounces" to a single
# numeric column expressed in ounces.
OUNCES_PER_POUND = 16

# Lower-case for unit matching and drop thousands separators ("1,200 pounds").
weight_text = sample['Shipping Weight'].str.lower().str.replace(',', '', regex=False)

# Pull out the leading number and its unit in one pass. Singular and plural
# unit spellings both match, and any trailing text after the unit is ignored.
weight_parts = weight_text.str.extract(
    r'(?P<value>\d+\.?\d*|\.\d+)\s*(?P<unit>ounce|pound)'
)

# Rows without a recognisable "<number> <unit>" pair end up as NaN.
to_ounces = weight_parts['unit'].map({'ounce': 1, 'pound': OUNCES_PER_POUND})
sample['weights_ounces'] = pd.to_numeric(weight_parts['value'], errors='coerce') * to_ounces

# The raw text column is no longer needed once the numeric one exists.
sample = sample.drop(columns=['Shipping Weight'])
sample
| Category | Brand Name | Selling Price | About Product | weights_ounces | |
|---|---|---|---|---|---|
| 0 | Sports & Outdoors | Outdoor Recreation | Skate... | NaN | 237.68 | Make sure this fits by entering your model num... | 171.20 |
| 1 | Toys & Games | Learning & Education | Science ... | NaN | 99.95 | Make sure this fits by entering your model num... | 64.00 |
| 2 | Toys & Games | Arts & Crafts | Craft Kits | NaN | 34.99 | Make sure this fits by entering your model num... | 12.80 |
| 3 | Toys & Games | Hobbies | Models & Model Kits |... | NaN | 28.91 | Make 8 different Planes at one time. | Experim... | 13.40 |
| 4 | Toys & Games | Puzzles | Jigsaw Puzzles | NaN | 17.49 | Make sure this fits by entering your model num... | 13.40 |
| ... | ... | ... | ... | ... | ... |
| 9997 | Toys & Games | Learning & Education | Counting... | NaN | 9.31 | NaN | 4.80 |
| 9998 | Toys & Games | Arts & Crafts | NaN | 6.99 | Make sure this fits by entering your model num... | 0.96 |
| 9999 | Office Products | Office & School Supplies | E... | NaN | 37.95 | Make sure this fits by entering your model num... | 44.80 |
| 10000 | Toys & Games | Arts & Crafts | Craft Kits | NaN | 3.58 | NaN | 6.10 |
| 10001 | Home & Kitchen | Furniture | Kids' Furniture |... | NaN | 152.27 | Make sure this fits by entering your model num... | 328.00 |
10002 rows × 5 columns
sample['weights_ounces'].isna().sum()
1140
weights_ounces can be stratified into a few groups¶(sample['weights_ounces']).plot.hist(bins=50,figsize=(10,5), edgecolor = 'white', range=[0,250])
plt.xlabel('Weights in Ounces')
plt.ylabel('frequency')
plt.tick_params(labelsize=15)
plt.title('Product Weights Distribution')
plt.show()
# Weight bands (in ounces). The number before the dash in each label is the
# inclusive lower edge of that band.
stratified_groups = [
    '0-14', '15-29', '30-44', '45-69',
    '70-84', '85-99', '100-114', '114-',
]

# Lower edges parsed from the labels, closed with +inf for the open-ended last band.
bins = [int(band.split('-')[0]) for band in stratified_groups]
bins.append(float('inf'))
labels = stratified_groups

# right=False makes every interval [lower, next_lower). Consequently the
# '45-69' band spans [45, 70) and the '100-114' band spans [100, 114).
# The numeric column is replaced by the categorical band label.
sample['weights_ounces'] = pd.cut(
    sample['weights_ounces'],
    bins=bins,
    labels=labels,
    right=False,
)
sample
| Category | Brand Name | Selling Price | About Product | weights_ounces | |
|---|---|---|---|---|---|
| 0 | Sports & Outdoors | Outdoor Recreation | Skate... | NaN | 237.68 | Make sure this fits by entering your model num... | 114- |
| 1 | Toys & Games | Learning & Education | Science ... | NaN | 99.95 | Make sure this fits by entering your model num... | 45-69 |
| 2 | Toys & Games | Arts & Crafts | Craft Kits | NaN | 34.99 | Make sure this fits by entering your model num... | 0-14 |
| 3 | Toys & Games | Hobbies | Models & Model Kits |... | NaN | 28.91 | Make 8 different Planes at one time. | Experim... | 0-14 |
| 4 | Toys & Games | Puzzles | Jigsaw Puzzles | NaN | 17.49 | Make sure this fits by entering your model num... | 0-14 |
| ... | ... | ... | ... | ... | ... |
| 9997 | Toys & Games | Learning & Education | Counting... | NaN | 9.31 | NaN | 0-14 |
| 9998 | Toys & Games | Arts & Crafts | NaN | 6.99 | Make sure this fits by entering your model num... | 0-14 |
| 9999 | Office Products | Office & School Supplies | E... | NaN | 37.95 | Make sure this fits by entering your model num... | 30-44 |
| 10000 | Toys & Games | Arts & Crafts | Craft Kits | NaN | 3.58 | NaN | 0-14 |
| 10001 | Home & Kitchen | Furniture | Kids' Furniture |... | NaN | 152.27 | Make sure this fits by entering your model num... | 114- |
10002 rows × 5 columns
sample['weights_ounces'].value_counts()
0-14 4982 15-29 1841 114- 688 30-44 626 45-69 379 70-84 172 85-99 91 100-114 83 Name: weights_ounces, dtype: int64
sample['Selling Price'].describe()
count 9484.000000 mean 34.314697 std 66.115053 min 0.010000 25% 9.990000 50% 16.900000 75% 29.990000 max 945.990000 Name: Selling Price, dtype: float64
Log-transformed Selling Price looks more normally distributed¶plt.subplot(1, 2, 1) #side by side plot
(sample['Selling Price']).plot.hist(bins=50,figsize=(10,5), edgecolor = 'white', range=[0,150])
plt.xlabel('price')
plt.ylabel('frequency')
plt.tick_params(labelsize=15)
plt.title('Price Distribution - original')
plt.subplot(1, 2, 2)
np.log(sample['Selling Price']+1).plot.hist(bins=50,figsize=(10,5), edgecolor = 'white')
plt.xlabel('log+1 price')
plt.ylabel('frequency')
plt.tick_params(labelsize=15)
plt.title('Log Price Distribution')
# fig, axes = plt.subplots(nrows=1,ncols=2,figsize=(12,6))
plt.show()
# log(1 + price). np.log1p is the numerically stable spelling of np.log(x + 1);
# NaN prices stay NaN.
sample['log_prices'] = np.log1p(sample['Selling Price'])
sample
| Category | Brand Name | Selling Price | About Product | weights_ounces | log_prices | |
|---|---|---|---|---|---|---|
| 0 | Sports & Outdoors | Outdoor Recreation | Skate... | NaN | 237.68 | Make sure this fits by entering your model num... | 114- | 5.475124 |
| 1 | Toys & Games | Learning & Education | Science ... | NaN | 99.95 | Make sure this fits by entering your model num... | 45-69 | 4.614625 |
| 2 | Toys & Games | Arts & Crafts | Craft Kits | NaN | 34.99 | Make sure this fits by entering your model num... | 0-14 | 3.583241 |
| 3 | Toys & Games | Hobbies | Models & Model Kits |... | NaN | 28.91 | Make 8 different Planes at one time. | Experim... | 0-14 | 3.398193 |
| 4 | Toys & Games | Puzzles | Jigsaw Puzzles | NaN | 17.49 | Make sure this fits by entering your model num... | 0-14 | 2.917230 |
| ... | ... | ... | ... | ... | ... | ... |
| 9997 | Toys & Games | Learning & Education | Counting... | NaN | 9.31 | NaN | 0-14 | 2.333114 |
| 9998 | Toys & Games | Arts & Crafts | NaN | 6.99 | Make sure this fits by entering your model num... | 0-14 | 2.078191 |
| 9999 | Office Products | Office & School Supplies | E... | NaN | 37.95 | Make sure this fits by entering your model num... | 30-44 | 3.662279 |
| 10000 | Toys & Games | Arts & Crafts | Craft Kits | NaN | 3.58 | NaN | 0-14 | 1.521699 |
| 10001 | Home & Kitchen | Furniture | Kids' Furniture |... | NaN | 152.27 | Make sure this fits by entering your model num... | 114- | 5.032201 |
10002 rows × 6 columns
Do product weights affect Selling Price?¶sample['weights_ounces'].value_counts()
0-14 4982 15-29 1841 114- 688 30-44 626 45-69 379 70-84 172 85-99 91 100-114 83 Name: weights_ounces, dtype: int64
# Note that we ignored a few groups because their sample sizes are too small.
def _prices_in_band(band):
    """Return the 'Selling Price' values of the rows whose weight band is *band*."""
    in_band = sample['weights_ounces'] == band
    return sample.loc[in_band, 'Selling Price']


group1, group2, group3, group4, group5 = (
    _prices_in_band(band)
    for band in ('0-14', '15-29', '30-44', '45-69', '114-')
)
Selling Price tends to be higher when weights is larger¶fig, ax = plt.subplots(figsize = (20,10))
# Overlay the log-price histograms of the five weight bands on the same axes.
# The first band is drawn opaque; the others are translucent so overlaps stay visible.
band_histograms = [
    (group1, '#8CB4E1', 1.0, 'Prices for 0-14 ounces'),
    (group2, '#007D00', 0.7, 'Prices for 15-29 ounces'),
    (group3, 'red', 0.7, 'Prices for 30-44 ounces'),
    (group4, 'purple', 0.7, 'Prices for 45-69 ounces'),
    (group5, 'yellow', 0.7, 'Prices for 114+ ounces'),
]
for prices, color, alpha, label in band_histograms:
    ax.hist(np.log(prices + 1), color=color, alpha=alpha, bins=50, label=label)

# Title and axis labels are set exactly once. The y axis is a raw count per
# bin (the histograms are not normalised), so it is labelled as a frequency.
ax.set(
    title='Price Distribution by Weights',
    xlabel='log(selling price+1)',
    ylabel='frequency',
)
ax.legend()
plt.show()
sample.head()
| Category | Brand Name | Selling Price | About Product | weights_ounces | log_prices | |
|---|---|---|---|---|---|---|
| 0 | Sports & Outdoors | Outdoor Recreation | Skate... | NaN | 237.68 | Make sure this fits by entering your model num... | 114- | 5.475124 |
| 1 | Toys & Games | Learning & Education | Science ... | NaN | 99.95 | Make sure this fits by entering your model num... | 45-69 | 4.614625 |
| 2 | Toys & Games | Arts & Crafts | Craft Kits | NaN | 34.99 | Make sure this fits by entering your model num... | 0-14 | 3.583241 |
| 3 | Toys & Games | Hobbies | Models & Model Kits |... | NaN | 28.91 | Make 8 different Planes at one time. | Experim... | 0-14 | 3.398193 |
| 4 | Toys & Games | Puzzles | Jigsaw Puzzles | NaN | 17.49 | Make sure this fits by entering your model num... | 0-14 | 2.917230 |
print('There are %d unique values in Category column.' % sample['Category'].nunique())
There are 938 unique values in Category column.
## Top 5 Categories
sample['Category'].value_counts()[:5]
Toys & Games | Games & Accessories | Board Games 284 Toys & Games | Puzzles | Jigsaw Puzzles 274 Toys & Games | Stuffed Animals & Plush Toys | Stuffed Animals & Teddy Bears 252 Toys & Games | Toy Figures & Playsets | Action Figures 235 Toys & Games | Party Supplies 193 Name: Category, dtype: int64
print('There are %d items that dont have a label.' % sample['Category'].isnull().sum())
There are 830 items that dont have a label.
# Category has many sub-categories and we want to split them
def split_cat(s):
    """Split a '|'-delimited category path into its levels.

    Args:
        s: Category text such as 'Toys & Games | Puzzles | Jigsaw Puzzles'.

    Returns:
        A list with one entry per level, each with surrounding whitespace
        removed so that the same level always yields the same label. If *s*
        is not a string (e.g. NaN or None for a missing category) the tuple
        ('No Label', 'No Label', 'No Label') is returned instead.
    """
    if not isinstance(s, str):
        return ('No Label', 'No Label', 'No Label')  # nothing to split
    return [level.strip() for level in s.split('|')]
def split_cat2(s):
    """Split a top-level category on '&' into its parts.

    Args:
        s: Top-level category text such as 'Toys & Games'.

    Returns:
        A list of the '&'-separated parts with surrounding whitespace removed
        (a single-element list when there is no '&'). If *s* is not a string
        the tuple ('No Label', 'No Label', 'No Label') is returned instead.
    """
    if not isinstance(s, str):
        return ('No Label', 'No Label', 'No Label')  # nothing to split
    return [part.strip() for part in s.split('&')]
def catch(s):
    """Return the second element of *s*, or 'No Label' when there is none.

    Args:
        s: An indexable value, typically the list produced by splitting a
            category string.

    Returns:
        ``s[1]`` if it exists; 'No Label' when *s* is too short, is a mapping
        without key 1, or does not support indexing at all (e.g. None or NaN).
    """
    try:
        return s[1]
    except (IndexError, KeyError, TypeError):
        # Only "no second element" situations are treated as unlabeled;
        # unrelated errors are no longer swallowed.
        return 'No Label'
# Break "A & B | C | ..." into general_cate="A", sub_cate1="B", sub_cate2="C".
# Each level is picked by position per row, so a category path with only one
# level gets 'No Label' for the missing part instead of breaking the assignment.
category_levels = sample['Category'].apply(split_cat)
top_level_parts = category_levels.apply(lambda levels: split_cat2(levels[0]))

sample['general_cate'] = top_level_parts.apply(lambda parts: parts[0])
sample['sub_cate2'] = category_levels.apply(catch)   # second '|' level
sample['sub_cate1'] = top_level_parts.apply(catch)   # text after '&' in the first level
sample.head()
| Category | Brand Name | Selling Price | About Product | weights_ounces | log_prices | general_cate | sub_cate2 | sub_cate1 | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Sports & Outdoors | Outdoor Recreation | Skate... | NaN | 237.68 | Make sure this fits by entering your model num... | 114- | 5.475124 | Sports | Outdoor Recreation | Outdoors |
| 1 | Toys & Games | Learning & Education | Science ... | NaN | 99.95 | Make sure this fits by entering your model num... | 45-69 | 4.614625 | Toys | Learning & Education | Games |
| 2 | Toys & Games | Arts & Crafts | Craft Kits | NaN | 34.99 | Make sure this fits by entering your model num... | 0-14 | 3.583241 | Toys | Arts & Crafts | Games |
| 3 | Toys & Games | Hobbies | Models & Model Kits |... | NaN | 28.91 | Make 8 different Planes at one time. | Experim... | 0-14 | 3.398193 | Toys | Hobbies | Games |
| 4 | Toys & Games | Puzzles | Jigsaw Puzzles | NaN | 17.49 | Make sure this fits by entering your model num... | 0-14 | 2.917230 | Toys | Puzzles | Games |
print('There are %d unique values in general category column.' % sample['general_cate'].nunique())
print('There are %d unique values in sub category1 column.' % sample['sub_cate1'].nunique())
print('There are %d unique values in sub category2 column.' % sample['sub_cate2'].nunique())
There are 23 unique values in general category column. There are 16 unique values in sub category1 column. There are 143 unique values in sub category2 column.
There are 23 unique values in the general category column, 16 in the sub-category1 column, and 143 in the sub-category2 column. We are interested in their distributions.¶#get the products and respective frequencies
# Count the rows per general category once and reuse the result for both axes.
general_counts = sample['general_cate'].value_counts()
x = general_counts.index.values.astype('str')
y = general_counts.values
# Each category's share of all rows, formatted as the text label of its bar.
pct = ["%.2f" % (share * 100) + "%" for share in y / len(sample)]
Toys is the most popular general category¶#plot a bar
# Bar chart of item counts per general category; bars are coloured by their
# count and a colour bar is shown next to the plot.
bar_marker = dict(color=y, colorscale='Portland', showscale=True, reversescale=False)
cate_di = go.Bar(x=x, y=y, text=pct, marker=bar_marker)
layout = {
    'title': 'Number of Items by General Category',
    'yaxis': {'title': 'Count'},
    'xaxis': {'title': 'Category'},
}
fig = {'data': [cate_di], 'layout': layout}
py.iplot(fig)
Top 10 most popular sub-category1 values¶#get the products and respective frequencies
# The ten most frequent sub_cate1 values (value_counts sorts by count, descending).
sub1_top10 = sample['sub_cate1'].value_counts().head(10)
x = sub1_top10.index.values.astype('str')
y = sub1_top10.values
# Share of all rows for each of those values, as the text label of its bar.
pct = ["%.2f" % (share * 100) + "%" for share in y / len(sample)]
Games is the most popular sub-category1, as 66.61%¶tracel = go.Bar(x=x, y=y, text=pct,
marker = dict(
color = y,colorscale='Portland', showscale=True,
reversescale = False))
layout = dict(title = 'Number of Items by Sub-Category (Top 10)',
yaxis = dict(title='Count'),
xaxis = dict(title = 'Sub-Category'))
fig = dict(data = [tracel], layout=layout)
py.iplot(fig)
Sub-Category2¶#get the products and respective frequencies
# The ten most frequent sub_cate2 values and their share of all rows.
sub2_top10 = sample['sub_cate2'].value_counts().head(10)
x = sub2_top10.index.values.astype('str')
y = sub2_top10.values
pct = ["%.2f" % (share * 100) + "%" for share in y / len(sample)]

# Bar chart of those counts; bars are coloured by their count.
bar_marker = dict(color=y, colorscale='Portland', showscale=True, reversescale=False)
tracel = go.Bar(x=x, y=y, text=pct, marker=bar_marker)
layout = {
    'title': 'Number of Items by Sub-Category2 (Top 10)',
    'yaxis': {'title': 'Count'},
    'xaxis': {'title': 'Sub-Category2'},
}
fig = {'data': [tracel], 'layout': layout}
py.iplot(fig)
Toys has the widest spread of prices across all general categories¶general_cats = sample['general_cate'].unique()
# One series of log prices per general category, in the order of `general_cats`.
x = [sample.loc[sample['general_cate'] == cat, 'log_prices'] for cat in general_cats]

# One box per category. The values are passed as `x`, so the boxes are drawn
# horizontally: the x axis carries the log price and the y axis the category.
data = [go.Box(x=prices, name=cat) for cat, prices in zip(general_cats, x)]
layout = dict(title='Price Distribution by General Category',
              xaxis=dict(title='log(Selling Price + 1)'),
              yaxis=dict(title='Category'))
fig = dict(data=data, layout=layout)
py.iplot(fig)
Does About Product affect Selling Price?¶!!! Note that we are actually using log_prices instead of actual prices. See the explanation above.
sample.head()
| Category | Brand Name | Selling Price | About Product | weights_ounces | log_prices | general_cate | sub_cate2 | sub_cate1 | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Sports & Outdoors | Outdoor Recreation | Skate... | NaN | 237.68 | Make sure this fits by entering your model num... | 114- | 5.475124 | Sports | Outdoor Recreation | Outdoors |
| 1 | Toys & Games | Learning & Education | Science ... | NaN | 99.95 | Make sure this fits by entering your model num... | 45-69 | 4.614625 | Toys | Learning & Education | Games |
| 2 | Toys & Games | Arts & Crafts | Craft Kits | NaN | 34.99 | Make sure this fits by entering your model num... | 0-14 | 3.583241 | Toys | Arts & Crafts | Games |
| 3 | Toys & Games | Hobbies | Models & Model Kits |... | NaN | 28.91 | Make 8 different Planes at one time. | Experim... | 0-14 | 3.398193 | Toys | Hobbies | Games |
| 4 | Toys & Games | Puzzles | Jigsaw Puzzles | NaN | 17.49 | Make sure this fits by entering your model num... | 0-14 | 2.917230 | Toys | Puzzles | Games |
# Lazily-filled cache so NLTK's stop-word list is loaded once, not once per word.
_ENGLISH_STOP_WORDS = None

# Translation table that turns every ASCII punctuation character into a space.
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, ' ' * len(string.punctuation))


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them on first use."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


# function to clear the text
def cleanWords(text, stop_words=None):
    """Count the words in *text* that are not stop words.

    The text is lower-cased, every punctuation character is replaced by a
    space, and the result is split on any run of whitespace (spaces, tabs,
    carriage returns, newlines), so empty tokens are never counted.

    Args:
        text: The description to measure. Anything that is not a string
            (e.g. NaN for a missing description) counts as zero words.
        stop_words: Optional collection of lower-case words to ignore.
            Defaults to NLTK's English stop-word list.

    Returns:
        The number of remaining words as an int.
    """
    if not isinstance(text, str):
        return 0
    if stop_words is None:
        stop_words = _english_stop_words()
    cleaned = text.lower().translate(_PUNCT_TO_SPACE)
    # Stop words are dropped because they do not relate to the topic.
    return sum(1 for word in cleaned.split() if word not in stop_words)
sample['words_len'] = sample['About Product'].apply(lambda x: cleanWords(x))
sample.head()
| Category | Brand Name | Selling Price | About Product | weights_ounces | log_prices | general_cate | sub_cate2 | sub_cate1 | words_len | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Sports & Outdoors | Outdoor Recreation | Skate... | NaN | 237.68 | Make sure this fits by entering your model num... | 114- | 5.475124 | Sports | Outdoor Recreation | Outdoors | 189 |
| 1 | Toys & Games | Learning & Education | Science ... | NaN | 99.95 | Make sure this fits by entering your model num... | 45-69 | 4.614625 | Toys | Learning & Education | Games | 100 |
| 2 | Toys & Games | Arts & Crafts | Craft Kits | NaN | 34.99 | Make sure this fits by entering your model num... | 0-14 | 3.583241 | Toys | Arts & Crafts | Games | 195 |
| 3 | Toys & Games | Hobbies | Models & Model Kits |... | NaN | 28.91 | Make 8 different Planes at one time. | Experim... | 0-14 | 3.398193 | Toys | Hobbies | Games | 43 |
| 4 | Toys & Games | Puzzles | Jigsaw Puzzles | NaN | 17.49 | Make sure this fits by entering your model num... | 0-14 | 2.917230 | Toys | Puzzles | Games | 36 |
# Mean log price for every distinct description length, as a two-column frame.
mean_price_by_len = sample.groupby('words_len')['log_prices'].mean()
df = mean_price_by_len.reset_index()

# Line chart (with point markers) of average log price against description length.
tracel = go.Scatter(x=df['words_len'], y=df['log_prices'],
                    mode='lines+markers', name='lines+markers')
layout = {
    'title': 'Average Log_Prices by Description Length',
    'xaxis': {'title': 'Description Length'},
    'yaxis': {'title': 'Average Log_Prices'},
}
fig = {'data': [tracel], 'layout': layout}
py.iplot(fig)
sample['About Product'].isna().sum()
273
sample = sample[pd.notnull(sample['About Product'])]
sample.head()
| Category | Brand Name | Selling Price | About Product | weights_ounces | log_prices | general_cate | sub_cate2 | sub_cate1 | words_len | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Sports & Outdoors | Outdoor Recreation | Skate... | NaN | 237.68 | Make sure this fits by entering your model num... | 114- | 5.475124 | Sports | Outdoor Recreation | Outdoors | 189 |
| 1 | Toys & Games | Learning & Education | Science ... | NaN | 99.95 | Make sure this fits by entering your model num... | 45-69 | 4.614625 | Toys | Learning & Education | Games | 100 |
| 2 | Toys & Games | Arts & Crafts | Craft Kits | NaN | 34.99 | Make sure this fits by entering your model num... | 0-14 | 3.583241 | Toys | Arts & Crafts | Games | 195 |
| 3 | Toys & Games | Hobbies | Models & Model Kits |... | NaN | 28.91 | Make 8 different Planes at one time. | Experim... | 0-14 | 3.398193 | Toys | Hobbies | Games | 43 |
| 4 | Toys & Games | Puzzles | Jigsaw Puzzles | NaN | 17.49 | Make sure this fits by entering your model num... | 0-14 | 2.917230 | Toys | Puzzles | Games | 36 |